# Libraries
import pandas as pd
pd.set_option('display.max_rows', 500)
import os
import matplotlib.pyplot as plt
from PIL import Image
# Source (GitHub): https://github.com/ncsu-landscape-dynamics/gsv_host_detector/tree/main/tree_inventory
# The AutoArborist dataset provides genus-level images and locations.
# It was merged with city tree inventories (species-level location data)
# by latitude/longitude; the merged table is loaded here.
aa_data = pd.read_csv(
    r"C:/users/talake2/downloads/AutoArboristData.csv",
    low_memory=False,
)
aa_data.head()

# Flag each record where the AutoArborist genus ('GENUS') and the tree
# inventory genus ('genus_name') are the same string, ignoring case.
aa_data['match'] = aa_data['GENUS'].str.lower().eq(
    aa_data['genus_name'].str.lower()
)

# Per-genus agreement between AutoArborist and the tree inventories:
#   total_records    - number of rows for the genus
#   match_count      - rows where the two genus labels agree
#   match_proportion - match_count / total_records
# Genera with fewer than 500 records are left out, and 'GENUS' is moved
# from the index back to an ordinary column.
match_summary = (
    aa_data
    .groupby('GENUS')
    .agg(total_records=('match', 'size'), match_count=('match', 'sum'))
    .assign(
        match_proportion=lambda df: df['match_count'] / df['total_records']
    )
    .loc[lambda df: df['total_records'] >= 500]
    .reset_index()
)

# Display the summary
match_summary
# Summarize the proportion of genus matches between AutoArborist and the
# tree inventories for every (City, GENUS) pair.
match_summary_city_genus = aa_data.groupby(['City', 'GENUS']).agg(
    total_records=('match', 'size'),
    match_count=('match', 'sum'),
)
match_summary_city_genus['match_proportion'] = (
    match_summary_city_genus['match_count']
    / match_summary_city_genus['total_records']
)

# Filter out (City, GENUS) pairs with fewer than 500 total_records
match_summary_city_genus = match_summary_city_genus[
    match_summary_city_genus['total_records'] >= 500
]

# Sort by match_proportion in descending order; reset_index turns 'City'
# and 'GENUS' back into ordinary columns so they can be filtered on.
match_summary_city_genus = match_summary_city_genus.sort_values(
    by='match_proportion', ascending=False
).reset_index()

# The table depends only on aa_data, so it is built once and reused for each
# genus report instead of being recomputed per genus. The genus filter is
# case-insensitive, matching how the 'match' column itself is computed.
_summary_genus_lower = match_summary_city_genus['GENUS'].str.lower()
for _genus in ('acer', 'juglans'):
    print(match_summary_city_genus[_summary_genus_lower == _genus])
# --- Maples (Acer) ---------------------------------------------------------
# Keep only rows that AutoArborist ('GENUS') labels as Acer ...
acer_data = aa_data[aa_data['GENUS'].str.lower() == 'acer']
# ... and that the tree inventory ('genus_name') also labels as Acer.
acer_data = acer_data[acer_data['genus_name'].str.lower() == 'acer']

# Split by the inventory's 'species_name'. The lower-cased column is computed
# once and reused for every species comparison below.
_acer_species = acer_data['species_name'].str.lower()

red_maple_data = acer_data[_acer_species == 'rubrum']
print(f"There are {len(red_maple_data)} records for red maple")

sugar_maple_data = acer_data[_acer_species == 'saccharum']
print(f"There are {len(sugar_maple_data)} records for sugar maple")

silver_maple_data = acer_data[_acer_species == 'saccharinum']
print(f"There are {len(silver_maple_data)} records for silver maple")

norway_maple_data = acer_data[_acer_species == 'platanoides']
print(f"There are {len(norway_maple_data)} records for norway maple")

# --- Walnuts (Juglans) -----------------------------------------------------
# Same two-step genus filter: AutoArborist 'GENUS', then inventory 'genus_name'.
juglans_data = aa_data[aa_data['GENUS'].str.lower() == 'juglans']
juglans_data = juglans_data[juglans_data['genus_name'].str.lower() == 'juglans']

black_walnut_data = juglans_data[
    juglans_data['species_name'].str.lower() == 'nigra'
]
print(f"There are {len(black_walnut_data)} records for black walnut")
# Define the base directory for images
base_image_dir = r"C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/jpegs_streetlevel_genus_idx_label"


def display_species_images_grid(data, num_images=20):
    """Display up to ``num_images`` images for the rows of ``data`` in a grid.

    Each image is looked up at
    ``<base_image_dir>/<type>/<GENUS lower-cased>/images/<IDX>.jpeg`` and,
    when the file exists, shown with the title
    ``"<genus_name> - <species_name>"``. Rows whose file is missing are
    reported with ``print`` and leave an empty cell. All axes are hidden.

    Args:
        data: DataFrame providing the 'type', 'GENUS', 'IDX', 'genus_name'
            and 'species_name' columns. May be empty, in which case an
            empty grid is shown.
        num_images: Maximum number of rows of ``data`` to plot. The grid is
            five columns wide with as many rows as needed; the default of
            20 gives the 4x5 grid on a 20x16 inch figure.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    n_cols = 5

    # Limit the data to the number of images to display
    data = data.head(num_images)

    # Size the grid from the requested image count (not a fixed 4x5) so that
    # num_images > 20 cannot run past the end of the axes array. Integer
    # ceiling division gives the number of rows; at least one row is kept.
    n_slots = max(num_images, len(data), 1)
    n_rows = (n_slots + n_cols - 1) // n_cols
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False
    )
    axes = axes.ravel()  # Flatten the 2D array of axes for easy iteration

    # Every cell is drawn without axes, whether or not it ends up holding an
    # image. Doing this up front also covers the empty-DataFrame case, where
    # there is no "last used index" to continue from.
    for ax in axes:
        ax.axis('off')

    for ax, (_, row) in zip(axes, data.iterrows()):
        # Construct the file path based on the 'type', 'GENUS', and 'IDX' columns
        image_path = os.path.join(
            base_image_dir,
            row['type'],
            row['GENUS'].lower(),
            'images',
            f"{row['IDX']}.jpeg",
        )
        if os.path.isfile(image_path):
            # Close the file handle as soon as the pixels are handed to
            # matplotlib, so plotting many grids does not leak open files.
            with Image.open(image_path) as img:
                ax.imshow(img)
            ax.set_title(f"{row['genus_name']} - {row['species_name']}")
        else:
            print(f"Image not found for IDX: {row['IDX']}")

    plt.tight_layout()
    plt.show()
# Show one image grid per species subset: the four maples, then black walnut.
for _species_data in (
    red_maple_data,
    sugar_maple_data,
    silver_maple_data,
    norway_maple_data,
    black_walnut_data,
):
    display_species_images_grid(_species_data)